Developed by Deepa M, Data Science, iPEC Solutions Private Limited Bangalore¶

In [83]:
import os

import numpy as np
import pandas as pd

# Path to the weather CSV. Defaults to the original author's local copy;
# set the WEATHER_DATA_CSV environment variable to point at the file on
# another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'WEATHER_DATA_CSV',
    'C:/Users/Manoj kumar/OneDrive/Desktop/Weather_Data_Final.csv',
)

df = pd.read_csv(DATA_PATH)
In [85]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)
Out[85]:
last_updated_epoch country location_name region latitude longitude timezone last_updated sunrise sunset ... precip_in humidity cloud feels_like_celsius feels_like_fahrenheit visibility_km visibility_miles uv_index gust_mph gust_kph
0 1693000001 India Ashoknagar Madhya Pradesh 24.57 77.72 Asia/Kolkata 29-08-2023 10:45 5:59 AM 6:41 PM ... 0.0 67 26 29.7 85.5 10.0 6 7 14.8 23.8
1 1693000002 India Raisen Madhya Pradesh 23.33 77.80 Asia/Kolkata 29-08-2023 10:45 6:00 AM 6:40 PM ... 0.0 70 19 30.0 86.0 10.0 6 7 11.2 18.0
2 1693000003 India Chhindwara Madhya Pradesh 22.07 78.93 Asia/Kolkata 29-08-2023 10:45 5:56 AM 6:34 PM ... 0.0 70 51 28.2 82.8 10.0 6 7 13.2 21.2
3 1693000004 India Betul Madhya Pradesh 21.86 77.93 Asia/Kolkata 29-08-2023 10:45 6:00 AM 6:38 PM ... 0.0 76 65 27.6 81.7 10.0 6 6 13.0 20.9
4 1693000005 India Hoshangabad Madhya Pradesh 22.75 77.72 Asia/Kolkata 29-08-2023 10:45 6:01 AM 6:39 PM ... 0.0 74 82 29.9 85.8 10.0 6 6 11.6 18.7

5 rows × 41 columns

In [87]:
# Count missing values in each column.
df.isna().sum()
Out[87]:
last_updated_epoch              0
country                         0
location_name                   0
region                          0
latitude                        0
longitude                       0
timezone                        0
last_updated                    0
sunrise                         0
sunset                          0
moonrise                        0
moonset                         0
moon_phase                      0
moon_illumination               0
temperature_celsius             0
temperature_fahrenheit          0
wind_mph                        0
wind_kph                        0
wind_degree                     0
wind_direction                  0
pressure_mb                     0
pressure_in                     0
air_quality_Carbon_Monoxide     0
air_quality_Ozone               0
air_quality_Nitrogen_dioxide    0
air_quality_Sulphur_dioxide     0
air_quality_PM2.5               0
air_quality_PM10                0
air_quality_us-epa-index        0
air_quality_gb-defra-index      0
precip_mm                       0
precip_in                       0
humidity                        0
cloud                           0
feels_like_celsius              0
feels_like_fahrenheit           0
visibility_km                   0
visibility_miles                0
uv_index                        0
gust_mph                        0
gust_kph                        0
dtype: int64
In [89]:
# Number of rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()
Out[89]:
0
In [91]:
def _combine_date_and_clock(dates, clock_times):
    """Attach each row's calendar date to a 12-hour clock string.

    Parameters
    ----------
    dates : pandas.Series (datetime64)
        Timestamps whose date part is reused for the result.
    clock_times : pandas.Series
        Clock strings such as '5:59 AM'.

    Returns
    -------
    pandas.Series (datetime64)
        Date plus clock time. Values that do not match
        '%Y-%m-%d %I:%M %p' become NaT (errors='coerce').
    """
    stamped = dates.dt.date.astype(str) + ' ' + clock_times.astype(str)
    return pd.to_datetime(stamped, format='%Y-%m-%d %I:%M %p', errors='coerce')


# Convert last_updated to datetime (stored in the CSV as 'DD-MM-YYYY HH:MM').
df['last_updated'] = pd.to_datetime(df['last_updated'], format='%d-%m-%Y %H:%M')

# Combine the observation date with the sunrise / sunset clock strings.
# Skip columns that are already datetime: re-running this cell would
# otherwise stringify the full timestamps, fail the format, and silently
# coerce every value to NaT.
for _col in ('sunrise', 'sunset'):
    if not pd.api.types.is_datetime64_any_dtype(df[_col]):
        df[_col] = _combine_date_and_clock(df['last_updated'], df[_col])
In [93]:
# Re-check for exact duplicate rows after the datetime conversion.
df.duplicated(keep='first').sum()
Out[93]:
0
In [95]:
# Drop two columns that are presumably redundant (temperature_celsius and
# last_updated are kept). errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the columns are gone.
df = df.drop(columns=['temperature_fahrenheit', 'last_updated_epoch'], errors='ignore')
In [97]:
# Preview the frame after the datetime conversion and column drop.
df.head(n=5)
Out[97]:
country location_name region latitude longitude timezone last_updated sunrise sunset moonrise ... precip_in humidity cloud feels_like_celsius feels_like_fahrenheit visibility_km visibility_miles uv_index gust_mph gust_kph
0 India Ashoknagar Madhya Pradesh 24.57 77.72 Asia/Kolkata 2023-08-29 10:45:00 2023-08-29 05:59:00 2023-08-29 18:41:00 5:42 PM ... 0.0 67 26 29.7 85.5 10.0 6 7 14.8 23.8
1 India Raisen Madhya Pradesh 23.33 77.80 Asia/Kolkata 2023-08-29 10:45:00 2023-08-29 06:00:00 2023-08-29 18:40:00 5:39 PM ... 0.0 70 19 30.0 86.0 10.0 6 7 11.2 18.0
2 India Chhindwara Madhya Pradesh 22.07 78.93 Asia/Kolkata 2023-08-29 10:45:00 2023-08-29 05:56:00 2023-08-29 18:34:00 5:32 PM ... 0.0 70 51 28.2 82.8 10.0 6 7 13.2 21.2
3 India Betul Madhya Pradesh 21.86 77.93 Asia/Kolkata 2023-08-29 10:45:00 2023-08-29 06:00:00 2023-08-29 18:38:00 5:36 PM ... 0.0 76 65 27.6 81.7 10.0 6 6 13.0 20.9
4 India Hoshangabad Madhya Pradesh 22.75 77.72 Asia/Kolkata 2023-08-29 10:45:00 2023-08-29 06:01:00 2023-08-29 18:39:00 5:38 PM ... 0.0 74 82 29.9 85.8 10.0 6 6 11.6 18.7

5 rows × 39 columns

In [99]:
# Summary statistics for the columns describe() picks up by default.
summary_stats = df.describe()
print(summary_stats)

# Number of rows recorded for each region, most frequent first.
rows_per_region = df['region'].value_counts()
print(rows_per_region)
           latitude     longitude                   last_updated  \
count  24070.000000  24070.000000                          24070   
mean      23.099527     80.237659  2023-09-19 16:39:05.234732032   
min        8.080000     68.970000            2023-08-29 10:15:00   
25%       20.270000     76.070000            2023-09-09 04:15:00   
50%       23.970000     78.680000            2023-09-20 04:00:00   
75%       26.770000     83.900000            2023-09-30 03:30:00   
max       34.570000     95.800000            2023-10-11 03:15:00   
std        5.796304      5.757138                            NaN   

                             sunrise                         sunset  \
count                          24070                          24070   
mean   2023-09-19 17:59:16.788533248  2023-09-20 06:12:12.830078976   
min              2023-08-29 04:44:00            2023-08-29 17:30:00   
25%              2023-09-09 04:54:00            2023-09-09 17:24:15   
50%              2023-09-20 04:58:30            2023-09-20 17:13:00   
75%              2023-09-30 06:39:45            2023-09-30 18:39:00   
max              2023-10-11 06:48:00            2023-10-11 18:34:00   
std                              NaN                            NaN   

       moon_illumination  temperature_celsius      wind_mph      wind_kph  \
count       24070.000000         24070.000000  24070.000000  24070.000000   
mean           54.981180            24.323166      5.512256      8.875347   
min             0.000000            -5.300000      2.200000      3.600000   
25%            22.000000            23.000000      2.900000      4.700000   
50%            56.000000            24.800000      4.700000      7.600000   
75%            91.000000            26.500000      6.900000     11.200000   
max           100.000000            38.300000     26.800000     43.200000   
std            34.926868             3.901488      3.270520      5.257236   

        wind_degree  ...     precip_in      humidity         cloud  \
count  24070.000000  ...  24070.000000  24070.000000  24070.000000   
mean     186.925634  ...      0.011052     80.593353     45.492480   
min        1.000000  ...      0.000000     10.000000      0.000000   
25%       90.000000  ...      0.000000     72.000000     13.000000   
50%      216.000000  ...      0.000000     85.000000     46.000000   
75%      270.000000  ...      0.000000     93.000000     75.000000   
max      360.000000  ...      1.720000    100.000000    100.000000   
std      101.276077  ...      0.047257     15.589535     33.665745   

       feels_like_celsius  feels_like_fahrenheit  visibility_km  \
count        24070.000000           24070.000000   24070.000000   
mean            26.498608              79.692368       8.311525   
min             -8.800000              16.300000       0.000000   
25%             25.100000              77.200000       9.000000   
50%             26.900000              80.400000      10.000000   
75%             29.300000              84.700000      10.000000   
max             50.500000             122.900000      10.000000   
std              4.975163               8.954775       3.046278   

       visibility_miles      uv_index      gust_mph      gust_kph  
count      24070.000000  24070.000000  24070.000000  24070.000000  
mean           4.886290      1.420357      9.546710     15.363830  
min            0.000000      1.000000      0.000000      0.000000  
25%            5.000000      1.000000      5.600000      9.100000  
50%            6.000000      1.000000      8.900000     14.400000  
75%            6.000000      1.000000     12.500000     20.200000  
max            6.000000      9.000000     47.000000     75.600000  
std            1.956184      1.522510      5.228817      8.414274  

[8 rows x 31 columns]
region
Uttar Pradesh                  2687
Madhya Pradesh                 2110
Rajasthan                      1597
Andhra Pradesh                 1437
Maharashtra                    1413
Bihar                          1276
Tamil Nadu                     1232
Orissa                         1084
Karnataka                      1017
Chhattisgarh                   1012
Gujarat                        1012
Assam                           926
West Bengal                     922
Haryana                         880
Jammu and Kashmir               879
Punjab                          836
Jharkhand                       792
Kerala                          616
Uttarakhand                     484
Manipur                         353
Mizoram                         308
Nagaland                        264
Himachal Pradesh                220
Tripura                          96
Goa                              89
Delhi                            88
Puducherry                       88
Arunachal Pradesh                88
Daman and Diu                    88
Andaman and Nicobar Islands      44
Lakshadweep                      44
Dadra and Nagar Haveli           44
Chandigarh                       44
Name: count, dtype: int64
In [101]:
# Column dtypes, non-null counts and memory footprint.
df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24070 entries, 0 to 24069
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   country                       24070 non-null  object        
 1   location_name                 24070 non-null  object        
 2   region                        24070 non-null  object        
 3   latitude                      24070 non-null  float64       
 4   longitude                     24070 non-null  float64       
 5   timezone                      24070 non-null  object        
 6   last_updated                  24070 non-null  datetime64[ns]
 7   sunrise                       24070 non-null  datetime64[ns]
 8   sunset                        24070 non-null  datetime64[ns]
 9   moonrise                      24070 non-null  object        
 10  moonset                       24070 non-null  object        
 11  moon_phase                    24070 non-null  object        
 12  moon_illumination             24070 non-null  int64         
 13  temperature_celsius           24070 non-null  float64       
 14  wind_mph                      24070 non-null  float64       
 15  wind_kph                      24070 non-null  float64       
 16  wind_degree                   24070 non-null  int64         
 17  wind_direction                24070 non-null  object        
 18  pressure_mb                   24070 non-null  int64         
 19  pressure_in                   24070 non-null  float64       
 20  air_quality_Carbon_Monoxide   24070 non-null  float64       
 21  air_quality_Ozone             24070 non-null  float64       
 22  air_quality_Nitrogen_dioxide  24070 non-null  float64       
 23  air_quality_Sulphur_dioxide   24070 non-null  float64       
 24  air_quality_PM2.5             24070 non-null  float64       
 25  air_quality_PM10              24070 non-null  float64       
 26  air_quality_us-epa-index      24070 non-null  int64         
 27  air_quality_gb-defra-index    24070 non-null  int64         
 28  precip_mm                     24070 non-null  float64       
 29  precip_in                     24070 non-null  float64       
 30  humidity                      24070 non-null  int64         
 31  cloud                         24070 non-null  int64         
 32  feels_like_celsius            24070 non-null  float64       
 33  feels_like_fahrenheit         24070 non-null  float64       
 34  visibility_km                 24070 non-null  float64       
 35  visibility_miles              24070 non-null  int64         
 36  uv_index                      24070 non-null  int64         
 37  gust_mph                      24070 non-null  float64       
 38  gust_kph                      24070 non-null  float64       
dtypes: datetime64[ns](3), float64(19), int64(9), object(8)
memory usage: 7.2+ MB
In [103]:
# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised
# by the plotting calls below; consider narrowing it.

import warnings

warnings.filterwarnings(action="ignore")

Univariate Analysis¶

In [106]:
# 1. What is the distribution of temperature in Celsius?

import matplotlib.pyplot as plt
import seaborn as sns

temperature = df['temperature_celsius']

# Four views of the same column on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_hist, ax_box), (ax_kde, ax_violin) = axes

sns.histplot(temperature, bins=30, kde=True, color='blue', ax=ax_hist)
ax_hist.set_title("Histogram")

sns.boxplot(x=temperature, color='red', ax=ax_box)
ax_box.set_title("Box Plot")

sns.kdeplot(temperature, fill=True, color='green', ax=ax_kde)
ax_kde.set_title("KDE Plot")

sns.violinplot(x=temperature, palette='magma', ax=ax_violin)
ax_violin.set_title("Violin Plot")

# Keep titles and tick labels from overlapping.
plt.tight_layout()
plt.show()
No description has been provided for this image
In [107]:
# 2. What is the average humidity level?

import matplotlib.pyplot as plt
import seaborn as sns

humidity = df['humidity']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Histogram', 'Boxplot', 'Violinplot', 'KDE']

# One drawer per panel, listed in the same order as the titles above.
drawers = (
    lambda ax: sns.histplot(humidity, kde=True, color='blue', ax=ax),
    lambda ax: sns.boxplot(x=humidity, color='green', ax=ax),
    lambda ax: sns.violinplot(x=humidity, color='purple', ax=ax),
    lambda ax: sns.kdeplot(humidity, fill=True, color='red', ax=ax),
)

# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for draw, ax, title in zip(drawers, axes.flat, plots):
    draw(ax).set_title(title)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [110]:
# 3. What is the most common cloud cover percentage?

cloud_cover = df['cloud']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Histogram', 'Boxplot', 'Violinplot', 'KDE']
(ax_hist, ax_box), (ax_violin, ax_kde) = axes

hist_ax = sns.histplot(cloud_cover, bins=20, kde=True, color='blue', ax=ax_hist)
hist_ax.set_title(plots[0])

box_ax = sns.boxplot(x=cloud_cover, color='green', ax=ax_box)
box_ax.set_title(plots[1])

violin_ax = sns.violinplot(x=cloud_cover, palette='coolwarm', ax=ax_violin)
violin_ax.set_title(plots[2])

kde_ax = sns.kdeplot(cloud_cover, fill=True, color='purple', ax=ax_kde)
kde_ax.set_title(plots[3])

plt.tight_layout()
plt.show()
No description has been provided for this image
In [112]:
# 4. What is the earliest sunrise time recorded?

# Express each sunrise as minutes after midnight so it can be plotted numerically.
sunrise_ts = pd.to_datetime(df['sunrise'], errors='coerce')
df['sunrise_time'] = sunrise_ts.dt.hour * 60 + sunrise_ts.dt.minute

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Histogram', 'Barplot', 'Scatterplot', 'Boxplot']

sunrise_minutes = df['sunrise_time']
row_positions = range(len(df))

hist_ax = sns.histplot(sunrise_minutes, bins=20, color='blue', ax=axes[0, 0])
hist_ax.set_title(plots[0])

# Bar panel shows only the first ten rows of the frame.
bar_ax = sns.barplot(x=sunrise_minutes.head(10), y=range(10), palette='viridis', ax=axes[0, 1])
bar_ax.set_title(plots[1])

scatter_ax = sns.scatterplot(x=row_positions, y=sunrise_minutes, color='purple', ax=axes[1, 0])
scatter_ax.set_title(plots[2])

box_ax = sns.boxplot(y=sunrise_minutes, palette='coolwarm', ax=axes[1, 1])
box_ax.set_title(plots[3])

plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [113]:
# 5. What is the maximum gust speed recorded in kph?

gust = df['gust_kph']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Histogram', 'Barplot', 'Scatterplot', 'Boxplot']

# One drawer per panel, in the same order as the titles above.
drawers = (
    lambda ax: sns.histplot(gust, bins=20, color='blue', ax=ax),
    # Bar panel shows only the first ten rows of the frame.
    lambda ax: sns.barplot(x=gust.head(10), y=range(10), palette='viridis', ax=ax),
    lambda ax: sns.scatterplot(x=range(len(df)), y=gust, color='purple', ax=ax),
    lambda ax: sns.boxplot(y=gust, palette='coolwarm', ax=ax),
)

for draw, ax, title in zip(drawers, axes.flat, plots):
    draw(ax).set_title(title)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [116]:
# 6. What is the moon phase mentioned in the dataset?

moon_phase = df['moon_phase']
# Rows per moon phase, computed once and shared by the bar and pie panels.
phase_counts = moon_phase.value_counts()

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Countplot', 'Barplot', 'Pie Chart', 'Histogram']

count_ax = sns.countplot(x=moon_phase, palette='coolwarm', ax=axes[0, 0])
count_ax.set_title(plots[0])

bar_ax = sns.barplot(x=phase_counts.index, y=phase_counts.values, palette='viridis', ax=axes[0, 1])
bar_ax.set_title(plots[1])

phase_counts.plot.pie(
    autopct='%1.1f%%',
    colors=sns.color_palette("pastel"),
    ax=axes[1, 0],
    title=plots[2],
)

hist_ax = sns.histplot(moon_phase, bins=10, kde=True, palette='mako', ax=axes[1, 1])
hist_ax.set_title(plots[3])

plt.tight_layout()
plt.show()
No description has been provided for this image
In [118]:
# 7. How is the UV index distributed across the dataset?

uv = df['uv_index']
# Rows per UV index value, used by the bar panel.
uv_counts = uv.value_counts()

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Histogram', 'Barplot', 'KDE Plot', 'Scatterplot']

hist_ax = sns.histplot(uv, bins=20, kde=True, color='blue', ax=axes[0, 0])
hist_ax.set_title(plots[0])

bar_ax = sns.barplot(x=uv_counts.index, y=uv_counts.values, palette='viridis', ax=axes[0, 1])
bar_ax.set_title(plots[1])

kde_ax = sns.kdeplot(uv, fill=True, color='green', ax=axes[1, 0])
kde_ax.set_title(plots[2])

scatter_ax = sns.scatterplot(x=range(len(df)), y=uv, color='red', ax=axes[1, 1])
scatter_ax.set_title(plots[3])

plt.tight_layout()
No description has been provided for this image
In [120]:
# 8. What is the distribution of visibility in kilometers?

visibility = df['visibility_km']
# The ten most frequent visibility values, used by the bar panel.
top_visibility = visibility.value_counts().head(10)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Histogram', 'Barplot', 'KDE Plot', 'Scatterplot']
(ax_hist, ax_bar), (ax_kde, ax_scatter) = axes

sns.histplot(visibility, bins=20, color='blue', ax=ax_hist)
ax_hist.set_title(plots[0])

sns.barplot(x=top_visibility.index, y=top_visibility.values, palette='viridis', ax=ax_bar)
ax_bar.set_title(plots[1])

sns.kdeplot(visibility, fill=True, color='green', ax=ax_kde)
ax_kde.set_title(plots[2])

sns.scatterplot(x=range(len(df)), y=visibility, color='red', ax=ax_scatter)
ax_scatter.set_title(plots[3])

plt.tight_layout()
plt.show()
No description has been provided for this image
In [121]:
# 9. What is the distribution of wind gust speeds in kph?

# The question is about gusts, so every panel reads gust_kph (not wind_kph).
gust = df['gust_kph']
# The ten most frequent gust values, used by the bar panel.
top_gust_values = gust.value_counts().head(10)

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Histogram', 'Barplot', 'KDE Plot', 'Scatterplot']
sns.histplot(gust, bins=20, color='blue', ax=axes[0, 0]).set_title(plots[0])
sns.barplot(x=top_gust_values.index, y=top_gust_values.values, palette='viridis', ax=axes[0, 1]).set_title(plots[1])
sns.kdeplot(gust, fill=True, color='red', ax=axes[1, 0]).set_title(plots[2])
sns.scatterplot(x=range(len(df)), y=gust, color='purple', ax=axes[1, 1]).set_title(plots[3])
plt.tight_layout()
plt.show()
No description has been provided for this image
In [124]:
# 10. What is the range of "feels like" temperatures in Celsius?

feels_like = df['feels_like_celsius']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
plots = ['Histogram', 'Boxplot', 'KDE Plot', 'Violin Plot']

# One drawer per panel, in the same order as the titles above.
drawers = (
    lambda ax: sns.histplot(feels_like, bins=20, color='blue', ax=ax),
    lambda ax: sns.boxplot(x=feels_like, palette='coolwarm', ax=ax),
    lambda ax: sns.kdeplot(feels_like, fill=True, color='green', ax=ax),
    lambda ax: sns.violinplot(x=feels_like, palette='magma', ax=ax),
)

for draw, ax, title in zip(drawers, axes.flat, plots):
    draw(ax).set_title(title)

plt.tight_layout()
plt.show()
No description has been provided for this image

Bivariate Analysis¶

In [127]:
# 1. How does temperature (in Celsius) correlate with humidity?

# Blue points with a red fitted regression line.
point_style = {"color": "blue"}
trend_style = {"color": "red"}

sns.lmplot(
    data=df,
    x="temperature_celsius",
    y="humidity",
    scatter_kws=point_style,
    line_kws=trend_style,
)
plt.title("Correlation between Temperature (°C) and Humidity")
plt.show()
No description has been provided for this image
In [128]:
# 2. What is the relationship between wind speed (kph) and gust speed (kph)?

# 2x2 correlation matrix of the two speed columns, annotated on the heatmap.
wind_gust_corr = df[['wind_kph', 'gust_kph']].corr()

sns.heatmap(wind_gust_corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Heatmap: Wind Speed vs. Gust Speed")
plt.show()
No description has been provided for this image
In [131]:
# 3. What is the latitude and longitude of Indore?

# Every row recorded for Indore, reduced to the two coordinate columns.
is_indore = df['location_name'] == 'Indore'
indore_coords = df[is_indore][['latitude', 'longitude']]

indore_coords.plot.bar(
    color=['blue', 'orange'],
    figsize=(6, 4),
    legend=True,
    title="Latitude & Longitude of Indore",
)
# Hide the x ticks: they would only show row index values.
plt.xticks([])
plt.show()
No description has been provided for this image
In [133]:
# 4. How does the "feels like" temperature compare to actual temperature in Celsius?

# Points are coloured by the feels-like value itself.
ax = sns.scatterplot(
    data=df,
    x='temperature_celsius',
    y='feels_like_celsius',
    hue='feels_like_celsius',
    palette='coolwarm',
)
ax.set_title("Feels Like vs Actual Temperature (°C)")
plt.show()
No description has been provided for this image
In [135]:
# 5. Is there a correlation between dew point temperature and humidity levels?

# NOTE(review): the frame has no dew point column, so this cell plots air
# temperature against humidity instead; confirm that substitution is intended.
ax = sns.scatterplot(
    data=df,
    x='temperature_celsius',
    y='humidity',
    hue='humidity',
    palette='viridis',
)
ax.set_title("Temperature vs Humidity")
plt.show()
No description has been provided for this image
In [137]:
# 6. How does sunrise time vary with different locations?

# Make sure the sunrise column is datetime before aggregating.
df['sunrise'] = pd.to_datetime(df['sunrise'])

# Earliest sunrise timestamp per location, then the 20 smallest of those.
# NOTE(review): min() compares full timestamps (date included), so the
# ranking is by earliest date-time rather than time of day; verify intent.
earliest_by_location = df.groupby('location_name')['sunrise'].min()
top_locations = earliest_by_location.nsmallest(20).reset_index()

# Only the hour component is plotted; minutes are discarded.
sunrise_hour = top_locations['sunrise'].dt.hour

plt.figure(figsize=(8, 4))
sns.barplot(data=top_locations, x='location_name', y=sunrise_hour, palette='plasma')

# Vertical tick labels so the location names stay readable.
plt.xticks(rotation=90)
plt.xlabel("Location")
plt.ylabel("Sunrise Hour")
plt.title("Variation of Sunrise Time by Location")
plt.show()
No description has been provided for this image
In [139]:
# 7. Find the temperature difference between actual and "feels like" values for the top 50 locations

# Build the subset in this cell so it runs on a fresh kernel.
# "Top 50" here means the first 50 distinct location names in row order.
df_top = df[df["location_name"].isin(df["location_name"].unique()[:50])]

# Per-row gap between the measured and the feels-like temperature.
temperature_gap = df_top['temperature_celsius'] - df_top['feels_like_celsius']

plt.figure(figsize=(12, 6))
sns.boxplot(data=df_top, x='location_name', y=temperature_gap, palette='coolwarm')

plt.xticks(rotation=90)
plt.ylabel("Actual - Feels Like (°C)")
plt.title("Temperature Difference (Actual - Feels Like) for Top 50 Locations")
plt.show()
No description has been provided for this image
In [141]:
# 8. How does cloud cover (%) vary with humidity (%) across top 50 locations?

# "Top 50" = the first 50 distinct location names in row order.
first_50_names = df["location_name"].unique()[:50]
df_top = df[df["location_name"].isin(first_50_names)]

# One small scatter panel per location, five panels per row, shared axes.
g = sns.FacetGrid(
    df_top,
    col="location_name",
    col_wrap=5,
    height=3,
    sharex=True,
    sharey=True,
)
g.map(sns.scatterplot, "humidity", "cloud", alpha=0.6, color="blue")

g.set_axis_labels("Humidity (%)", "Cloud Cover (%)")
g.set_titles(col_template="{col_name}")

# Leave headroom above the grid for the overall title.
plt.subplots_adjust(top=0.9)
g.fig.suptitle("Cloud Cover vs Humidity Across Top 50 Locations", fontsize=16)
plt.show()
No description has been provided for this image
In [143]:
# 9. Determine the correlation between visibility and humidity?

# Semi-transparent blue points with a red fitted regression line.
point_style = {'alpha': 0.6, 'color': 'blue'}
trend_style = {'color': 'red'}

sns.lmplot(
    data=df,
    x="humidity",
    y="visibility_km",
    aspect=1.5,
    scatter_kws=point_style,
    line_kws=trend_style,
)
plt.xlabel("Humidity (%)")
plt.ylabel("Visibility (km)")
plt.title("Correlation between Visibility and Humidity")
plt.show()
No description has been provided for this image
In [145]:
# 10. What is the effect of UV index on temperature (Celsius)?

# Points are coloured by the temperature value itself.
sns.scatterplot(
    data=df,
    x="uv_index",
    y="temperature_celsius",
    hue="temperature_celsius",
    palette="coolwarm",
)
plt.title("Effect of UV Index on Temperature (Celsius)")
plt.show()
No description has been provided for this image

Multivariate Analysis¶

In [148]:
# 1. How do temperature (in Celsius), humidity, and wind speed (kph) interact with each other?

# corner=True draws only the lower triangle of the pair grid.
weather_vars = ['temperature_celsius', 'humidity', 'wind_kph']
sns.pairplot(df, vars=weather_vars, palette='coolwarm', corner=True)
plt.show()
No description has been provided for this image
In [150]:
# 2. What is the relationship between air quality parameters (Carbon Monoxide, Ozone, PM2.5) and temperature?

# corner=True draws only the lower triangle of the pair grid.
air_quality_vars = [
    'air_quality_Carbon_Monoxide',
    'air_quality_Ozone',
    'air_quality_PM2.5',
    'temperature_celsius',
]
sns.pairplot(df, vars=air_quality_vars, palette='coolwarm', corner=True)
plt.show()
No description has been provided for this image
In [152]:
# 3. Is there a connection between sunrise time, sunset time, and temperature variations across different locations?

# Reduce sunrise and sunset to their hour of day so they can enter a numeric
# correlation. The columns are overwritten in place, so skip any column that
# is already numeric: on a re-run, parsing the hour integers as timestamps
# would collapse every value to hour 0.
for _col in ('sunrise', 'sunset'):
    if pd.api.types.is_numeric_dtype(df[_col]):
        continue
    df[_col] = pd.to_datetime(df[_col]).dt.hour

# Create correlation heatmap
plt.figure(figsize=(8,6))
sns.heatmap(df[['sunrise', 'sunset', 'temperature_celsius']].corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Between Sunrise, Sunset, and Temperature")
plt.show()
No description has been provided for this image
In [154]:
# 4. What is the combined effect of wind speed, gust speed, and cloud cover on temperature?

# Marker size encodes cloud cover; marker colour encodes temperature.
bubble_options = dict(
    size='cloud',
    hue='temperature_celsius',
    palette='coolwarm',
    alpha=0.7,
    edgecolor='black',
    sizes=(20, 500),
)

plt.figure(figsize=(8,6))
sns.scatterplot(data=df, x='wind_kph', y='gust_kph', **bubble_options)
plt.title("Effect of Wind Speed, Gust Speed & Cloud Cover on Temperature")
plt.xlabel("Wind Speed (kph)")
plt.ylabel("Gust Speed (kph)")
# Anchor the legend just outside the right edge of the axes.
plt.legend(title="Cloud Cover & Temperature", loc='upper right', bbox_to_anchor=(1.25, 1))
plt.show()
No description has been provided for this image
In [156]:
# 5. Does latitude and longitude significantly impact temperature, humidity, and air quality?

# This KDE uses only the two coordinate columns, so it shows where the
# observations are concentrated. Temperature, humidity and air quality are
# not inputs, and the title says so.
# NOTE(review): the question itself is not answered by this plot.
plt.figure(figsize=(10,6))
sns.kdeplot(data=df, x='latitude', y='longitude', fill=True, cmap="coolwarm", levels=50)
plt.title("KDE Plot of Observation Density by Latitude & Longitude")
plt.xlabel("Latitude")
plt.ylabel("Longitude")
plt.show()
No description has been provided for this image
In [158]:
# 6. How does UV index vary based on temperature, humidity, and cloud cover?

# Rows are temperature values, columns are humidity values, and each cell is
# the UV index aggregated by pivot_table's default function.
# Cloud cover is not used in this cell.
heatmap_data = df.pivot_table(
    index="temperature_celsius",
    columns="humidity",
    values="uv_index",
)

plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data, cmap="coolwarm", annot=True, fmt=".1f")

plt.xlabel("Humidity (%)")
plt.ylabel("Temperature (°C)")
plt.title("Heatmap of UV Index based on Temperature and Humidity")
plt.show()
No description has been provided for this image
In [160]:
# 7. What is the combined impact of sunrise time, sunset time, and moon illumination on daily temperature changes?

# NOTE(review): 'sunrise_time' was built as minutes after midnight, while
# 'sunset' holds whatever an earlier cell left in it; confirm the mixed
# units are intended.
pair_vars = ['sunrise_time', 'sunset', 'moon_illumination', 'temperature_celsius']

sns.pairplot(
    df,
    vars=pair_vars,
    hue='moon_illumination',
    palette='coolwarm',
    diag_kind='kde',
)

# y > 1 places the overall title above the grid.
plt.suptitle('Pairplot: Sunrise, Sunset, Moon Illumination & Temperature', y=1.02)
plt.show()
No description has been provided for this image
In [162]:
# 8. What is the relationship between wind direction, wind speed, and gust speed across different locations?

# Wind direction is represented by the numeric wind_degree column.
wind_vars = ["wind_kph", "gust_kph", "wind_degree"]

sns.pairplot(df, vars=wind_vars, palette="coolwarm", diag_kind="kde")
# y > 1 places the overall title above the grid.
plt.suptitle("Pair Plot of Wind Speed, Gust Speed, and Wind Direction Across Locations", y=1.02)
plt.show()
No description has been provided for this image
In [164]:
# 9. How do pressure, temperature, and cloud cover affect the probability of precipitation?

# Correlate the three predictors with precip_mm.
# NOTE(review): precip_mm looks like an amount, not a probability; confirm.
precip_columns = ["pressure_mb", "temperature_celsius", "cloud", "precip_mm"]
corr_matrix = df[precip_columns].corr()

# Annotated heatmap of the 4x4 correlation matrix.
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Between Pressure, Temperature, Cloud Cover & Precipitation Probability")
plt.show()
No description has been provided for this image
In [166]:
# 10. What is the combined impact of sunrise time, sunset time, and moon illumination on daily temperature changes?

# Three 2-D KDEs against temperature, drawn on the same axes in this order,
# each with its own colormap and a decreasing opacity.
kde_layers = (
    ('sunrise', 'coolwarm', 0.7),
    ('sunset', 'viridis', 0.5),
    ('moon_illumination', 'plasma', 0.3),
)

plt.figure(figsize=(10, 6))
for x_column, colormap, opacity in kde_layers:
    sns.kdeplot(data=df, x=x_column, y='temperature_celsius', fill=True, cmap=colormap, alpha=opacity)

plt.title("KDE Plot: Impact of Sunrise, Sunset & Moon Illumination on Temperature")
plt.xlabel("Time / Illumination")
plt.ylabel("Temperature (°C)")
plt.show()
No description has been provided for this image

Modelling¶

Random Forest Regression¶

In [186]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
In [188]:
# Numeric predictors for the temperature model; categorical columns are left out.
features = [
    'wind_kph',
    'pressure_mb',
    'humidity',
    'cloud',
    'visibility_km',
    'uv_index',
    'gust_kph',
]
# Column the model learns to predict.
target = 'temperature_celsius'
In [190]:
# Feature matrix and target vector taken from the cleaned frame.
X, y = df[features], df[target]
In [192]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
In [194]:
# Random forest with 100 trees and a fixed seed, fitted on the training split.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
Out[194]:
RandomForestRegressor(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(random_state=42)
In [196]:
# Temperature predictions for the held-out rows.
y_pred = rf_model.predict(X_test)
In [198]:
# Score the random forest predictions against the held-out targets.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report each metric rounded to two decimals.
for metric_name, metric_value in (
    ("Mean Absolute Error", mae),
    ("Root Mean Squared Error", rmse),
    ("R-squared Score", r2),
):
    print(f"{metric_name}: {metric_value:.2f}")
Mean Absolute Error: 1.06
Root Mean Squared Error: 1.63
R-squared Score: 0.82

XGBoost Regression¶

In [201]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
In [203]:
# Same numeric predictors and target as the random forest model.
features = [
    'wind_kph',
    'pressure_mb',
    'humidity',
    'cloud',
    'visibility_km',
    'uv_index',
    'gust_kph',
]
target = 'temperature_celsius'

# Feature matrix and target vector.
X, y = df[features], df[target]

# Same 80/20 split and seed as the random forest cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted trees: 100 boosting rounds, learning rate 0.1, fixed seed.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Temperature predictions for the held-out rows.
y_pred_xgb = xgb_model.predict(X_test)

# Score the predictions against the held-out targets.
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost Regression Results:")
for metric_name, metric_value in (
    ("Mean Absolute Error", mae_xgb),
    ("Root Mean Squared Error", rmse_xgb),
    ("R-squared Score", r2_xgb),
):
    print(f"{metric_name}: {metric_value:.2f}")
XGBoost Regression Results:
Mean Absolute Error: 1.25
Root Mean Squared Error: 1.76
R-squared Score: 0.79

If you want to know more or run the same code, contact me¶

https://www.linkedin.com/in/deepa-m-9bb555361/¶

In [ ]: